In [18]:
from __future__ import print_function
import pandas as pd
import pickle
import numpy as np
from itertools import chain
from collections import OrderedDict
%load_ext autoreload
import random
In [22]:
# Load the bar review dataset
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized_SF.pickle')
review.head(5)
df_businesses = pd.read_pickle('../input/yelp_academic_dataset_business_SF.pickle')
# Unique "City, ST" strings for the businesses (the first entry of the unordered set is dropped).
city_state_list = list(set(df_businesses.city + ', ' + df_businesses.state))[1:]
pickle.dump(city_state_list, open('../output/city_state_list.pickle', 'wb'))
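In [ ]:
# Optional sanity check on the loaded data. The column names printed about here are the ones
# relied on later in the notebook: user_id, business_id and cleaned_tokenized for the reviews,
# city and state for the businesses.
print(review.shape, df_businesses.shape)
print(len(city_state_list), 'city/state pairs')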
In [23]:
review.tail(5)
Out[23]:
In [21]:
"deb75413-f53d-4f35-a403-d7d0048e2c97"
Out[21]:
In [24]:
# Hold out 20% of the users from the dataset for testing
user_set = list(set(review.user_id.values))
random.seed(0)
random.shuffle(user_set)  # Randomize the user order
n_users = len(user_set)
n_training = int(n_users * 0.8)
user_set_training = user_set[:n_training]
with open('../output/training_users.pickle', 'wb') as f:
    pickle.dump(user_set_training, f)
# Save the held-out test set
test_users = user_set[n_training:]
with open('../output/test_users.pickle', 'wb') as f:
    pickle.dump(test_users, f)
# Restrict the active review set to training users only
review = review[review.user_id.isin(user_set_training)]
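In [ ]:
# Optional sanity check on the 80/20 user split: the two sets should be disjoint and together
# cover every user.
assert set(user_set_training).isdisjoint(test_users)
assert len(user_set_training) + len(test_users) == len(user_set)
print(len(user_set_training), 'training users,', len(test_users), 'test users')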
In [11]:
"deb75413-f53d-4f35-a403-d7d0048e2c97" in user_set_training
Out[11]:
In [25]:
# This is review-level, not business-level
# docs = [" ".join(list(chain.from_iterable(l))) for l in review.cleaned_tokenized.iloc[:]]
n_reviews = None  # use all reviews
# Merge all reviews of each business into a single document (one long string of words).
reviews_merged_bus = OrderedDict()
business_set = set(review.business_id.values[:n_reviews])
for i_bus, bus_id in enumerate(business_set):
    if (i_bus % 2) == 0:
        print('\r Fraction Processed', float(i_bus + 1) / len(business_set), end="")
    # Collapse each review of this business (a list of tokenized sentences) into a flat list
    # of words, then join the words of all its reviews into one string.
    reviews_merged_bus[bus_id] = " ".join(chain.from_iterable(
        chain.from_iterable(review.cleaned_tokenized[review.business_id == bus_id])))
docs_bus = list(reviews_merged_bus.values())
with open('../output/docs_bars_bus.pickle', 'wb') as f:
    pickle.dump(docs_bus, f)
with open('../output/bus_ids_bars_LDA.pickle', 'wb') as f:
    pickle.dump(list(reviews_merged_bus.keys()), f)
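In [ ]:
# A faster, equivalent way to build the per-business documents (sketch): group once by
# business_id instead of re-scanning the whole frame for every business. This assumes
# cleaned_tokenized holds a list of tokenized sentences per review, as the loop above implies.
reviews_merged_bus_alt = OrderedDict(
    (bus_id, " ".join(chain.from_iterable(chain.from_iterable(grp))))
    for bus_id, grp in review.groupby('business_id')['cleaned_tokenized'])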
In [71]:
In [4]:
# Flatten the reviews, so each review is just a single list of words.
# reviews_merged_user = OrderedDict()
# user_set = list(set(review.user_id.values[:n_reviews]))
# n_users = float(len(user_set))
# for i_user, user_id in enumerate(user_set[:]):
# if ((i_user%50)==0):
# print ('\r Fraction Processed',float(i_user+1)/n_users, end="")
# # This horrible line first collapses each review of a corresponding user reviews into a list
# # of lists, and then collapses the list of sentences to a long list of words
# reviews_merged_user[user_id] = " ".join(list(chain.from_iterable(
# chain.from_iterable( review.cleaned_tokenized[review.user_id==user_id] ))))
# docs_users = reviews_merged_user.values()
# print()
# print("Merging Done...")
# with open('../output/docs_bars_users.pickle', 'wb') as f:
# pickle.dump(docs_users, f)
In [26]:
# Flatten each review into a single document (one string of words).
docs_reviews = [" ".join(chain.from_iterable(rev)) for rev in review.cleaned_tokenized.values[:n_reviews]]
with open('../output/docs_reviews.pickle', 'wb') as f:
pickle.dump(docs_reviews, f)
In [ ]:
In [ ]:
In [27]:
%autoreload 2
import sys
sys.path.append('../vectorsearch/')
import LDA
try:
    from importlib import reload  # Python 3: reload is no longer a builtin
except ImportError:
    pass
reload(LDA)
n_topics=30
n_features=10000
max_df=.75
min_df=3
max_iter=10
alpha=6./n_topics
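In [ ]:
# For reference: LDA.LDA is a project-local wrapper that appears to wrap scikit-learn's
# LatentDirichletAllocation (its .lda attribute is used that way below). A minimal sketch of
# the equivalent scikit-learn setup with the same hyperparameters; tf_vectorizer/tf_matrix are
# illustrative names, and n_topics was renamed n_components in newer scikit-learn releases.
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=n_features,
#                                 stop_words='english')
# tf_matrix = tf_vectorizer.fit_transform(docs_bus)
# lda_sklearn = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=alpha,
#                                         max_iter=max_iter, learning_method='online')
# lda_sklearn.fit(tf_matrix)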
In [ ]:
# Train the bar set over businesses
#doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
lda_bus = LDA.LDA(alpha=alpha, n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
lda_bus.vectorizecounts(docs_bus)
lda_bus.fitLDA()
LDA.SaveLDAModel('../output/LDA_model_bus.pickle', lda_bus)
# Now can
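In [ ]:
# Quick look at the fitted business-level topics. print_top_words is part of the local LDA
# wrapper (it is also referenced in a commented-out cell below); the equivalent via the
# underlying scikit-learn attributes would use lda_bus.lda.components_.
lda_bus.print_top_words(10)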
In [7]:
# Train the bar set over users
# doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
# lda_user = LDA.LDA(n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
# lda_user.vectorizecounts(docs_users)
# lda_user.fitLDA()
# LDA.SaveLDAModel('../output/LDA_model_user.pickle', lda_user)
In [12]:
# Train LDA at the level of individual reviews
docs_reviews = pickle.load(open('../output/docs_reviews.pickle', 'rb'))
lda_reviews = LDA.LDA(alpha=alpha, n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
lda_reviews.vectorizecounts(docs_reviews)
lda_reviews.fitLDA()
LDA.SaveLDAModel('../output/LDA_model_reviews.pickle', lda_reviews)
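In [ ]:
# Spot-check the fitted review-level model: per-document topic distributions for a few reviews.
# This assumes the wrapper stores the fitted model and count matrix as .lda and .tf, the same
# attributes the business-level cells below rely on.
doc_topics_sample = lda_reviews.lda.transform(lda_reviews.tf[10:20])
print(doc_topics_sample.shape)  # (10, n_topics)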
In [51]:
# doc_users = pickle.load(open('../output/docs_bars_users.pickle', 'rb'))
# lda_user = LDA.LDA(n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter,)
# lda_user.vectorizecounts(docs_users)
# lda_user.fitLDA()
# LDA.SaveLDAModel('../output/LDA_model_user.pickle', lda_user)
#lda_bus.print_top_words(10)
#.get_doc_topics(doc_users[10:12])
In [ ]:
import sys
sys.path.append('../vectorsearch/')
import LDA
bus_lda = LDA.LoadLDAModel('../output/LDA_model_bus.pickle')
In [ ]:
In [ ]:
# The topic vector for a given business is given by this dataframe.
bus_lda_ids = pickle.load(open('../output/bus_ids_bars_LDA.pickle', 'rb'))
bus_vectors = pd.DataFrame()
bus_vectors['business_id'] = bus_lda_ids
transformed = bus_lda.lda.transform(bus_lda.tf)
In [ ]:
print(transformed.shape)
print(len(bus_vectors))
bus_vectors['topic_vector'] = [bus_topic_vec for bus_topic_vec in transformed]
# Normalize each topic vector to unit length so that dot products give cosine similarity.
normed_topic_vecs = [topic_vec / np.sqrt(np.dot(topic_vec, topic_vec))
                     for topic_vec in bus_vectors.topic_vector]
bus_vectors.topic_vector = normed_topic_vecs
bus_vectors.to_pickle('../output/business_LDA_vectors.pickle')
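In [ ]:
# With unit-normalized topic vectors, a dot product is the cosine similarity between two
# businesses. A minimal sketch (illustrative only) ranking the businesses most similar to the
# first one in the table:
query_vec = bus_vectors.topic_vector.iloc[0]
sims = np.array([np.dot(query_vec, vec) for vec in bus_vectors.topic_vector])
print(bus_vectors.business_id.values[np.argsort(sims)[::-1][:5]])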
In [ ]:
# Visualization
# lda_reviews.get_doc_topics(docs_reviews[10:20])
In [1]:
# import pyLDAvis
# import pandas as pd
# import funcy as fp
# from pyLDAvis import prepare as vis_prepare
# def _extract_data(docs, vect, lda):
#     # The scikit-learn LDA implementation seems to have buggy code:
#     # topic_term_dists and doc_topic_dists don't sum to 1,
#     # hence the norm function to normalize the distributions.
#     norm = lambda data: pd.DataFrame(data).div(data.sum(1), axis=0).values
#     vected = vect.fit_transform(docs)
#     doc_topic_dists = norm(lda.fit_transform(vected))
#     return lda, vect, dict(
#         doc_lengths = docs.str.len(),
#         vocab = vect.get_feature_names(),
#         term_frequency = vected.sum(axis=0).tolist()[0],
#         topic_term_dists = norm(lda.components_),
#         doc_topic_dists = doc_topic_dists)
# def prepare(docs, vect, lda, **kwargs):
#     """Create Prepared Data from sklearn's vectorizer and Latent Dirichlet
#     Allocation.
#     Parameters
#     ----------
#     docs : Pandas Series.
#         Documents to be passed as an input.
#     vect : Scikit-Learn Vectorizer (CountVectorizer, TfidfVectorizer).
#         Vectorizer to convert documents into a sparse matrix.
#     lda : sklearn.decomposition.LatentDirichletAllocation.
#         Latent Dirichlet Allocation
#     **kwargs : Keyword arguments to be passed to pyLDAvis.prepare()
#     Returns
#     -------
#     prepared_data : PreparedData
#         The data structures used in the visualization
#     Example
#     --------
#     For example usage please see this notebook:
#     http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb
#     See
#     ------
#     See `pyLDAvis.prepare` for **kwargs.
#     """
#     opts = fp.merge(_extract_data(docs, vect, lda)[2], kwargs)
#     return vis_prepare(**opts)
# vis_data = prepare(docs, tf_vectorizer, lda)
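In [ ]:
# Note: recent pyLDAvis releases ship scikit-learn support directly, which would replace the
# hand-rolled prepare() above (the module path has moved between versions, and the variable
# names here are illustrative, so treat this as a sketch):
# import pyLDAvis.sklearn
# vis_data = pyLDAvis.sklearn.prepare(lda, tf_matrix, tf_vectorizer)
# pyLDAvis.display(vis_data)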
In [ ]:
In [13]:
import sys
sys.path.append('../vectorsearch/')
import LDA
bus_lda = LDA.LoadLDAModel('../output/LDA_model_bus.pickle')
In [15]:
bus_lda.lda.n_jobs = 1
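In [ ]:
# Example of projecting new text into the business topic space with the loaded model.
# This assumes the wrapper keeps its fitted CountVectorizer as .tf_vectorizer; if it is stored
# under another name, adjust accordingly.
# new_doc = ["great cocktails and a quiet back patio"]
# print(bus_lda.lda.transform(bus_lda.tf_vectorizer.transform(new_doc)))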
In [ ]: